Import required packages¶

In [ ]:
import pickle
from pathlib import Path

import dtreeviz
import ee
import geemap
import geojson
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import plotly.express as px
import scipy.interpolate
import shap
from cuml.explainer import PermutationExplainer
from cuml.model_selection import train_test_split as cuml_train_test_split
from optuna.integration.lightgbm import LightGBMTunerCV
from optuna_integration import LightGBMPruningCallback
from plotly import graph_objects as go
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split

Second enviroment import due to environment conflict between RAPIDSAI and Pytorch-tabnet

In [ ]:
# import lightgbm as lgb
# import numpy as np
# import pandas as pd
# import torch
# from pytorch_tabnet.tab_model import TabNetClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import (
#     accuracy_score,
#     precision_recall_fscore_support,
#     roc_auc_score,
# )
# from sklearn.model_selection import train_test_split
In [ ]:
ee.Authenticate()
ee.Initialize()
Map = geemap.Map()
Map.add_basemap("HYBRID")

Exploratory Data Analysis (EDA)¶

Data Collection¶

Dataset used:

  • Global Flood Database v1 (2000-2018)

  • USGS Landsat 7 Level 2, Collection 2, Tier 1

  • SRTM Digital Elevation Data Version 4

In [ ]:
flood_collection: ee.ImageCollection = ee.ImageCollection(
    "GLOBAL_FLOOD_DB/MODIS_EVENTS/V1"
)

landsat_collection: ee.ImageCollection = ee.ImageCollection(
    "LANDSAT/LE07/C02/T1_L2"
).filterDate("2000-02-17", "2018-12-10")
elevation_image = ee.Image("CGIAR/SRTM90_V4")

print("Image structure of each collection:")
display(flood_collection.first(), landsat_collection.first(), elevation_image)
Image structure of each collection:
    • type:Image
    • id:GLOBAL_FLOOD_DB/MODIS_EVENTS/V1/DFO_1586_From_20000218_to_20000301
    • version:1685079690200045
        • id:flooded
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:137.20418491999513
          • 3:0
          • 4:-0.002245788210298804
          • 5:-17.514902252120372
          • type:PixelType
          • max:255
          • min:0
          • precision:int
          • 0:5686
          • 1:8984
        • id:duration
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:137.20418491999513
          • 3:0
          • 4:-0.002245788210298804
          • 5:-17.514902252120372
          • type:PixelType
          • max:65535
          • min:0
          • precision:int
          • 0:5686
          • 1:8984
        • id:clear_views
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:137.20418491999513
          • 3:0
          • 4:-0.002245788210298804
          • 5:-17.514902252120372
          • type:PixelType
          • max:65535
          • min:0
          • precision:int
          • 0:5686
          • 1:8984
        • id:clear_perc
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:137.20418491999513
          • 3:0
          • 4:-0.002245788210298804
          • 5:-17.514902252120372
          • type:PixelType
          • precision:float
          • 0:5686
          • 1:8984
        • id:jrc_perm_water
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:137.20418491999513
          • 3:0
          • 4:-0.002245788210298804
          • 5:-17.514902252120372
          • type:PixelType
          • max:255
          • min:0
          • precision:int
          • 0:5686
          • 1:8984
      • id:1586
      • cc:AUS
      • composite_type:3Day
      • countries:Australia
      • dfo_centroid_x:143.6978
      • dfo_centroid_y:-31.268059
      • dfo_country:Australia
      • dfo_dead:1
      • dfo_displaced:200
      • dfo_main_cause:Monsoonal rain
      • dfo_other_country:NA
      • dfo_severity:2
      • dfo_validation_type:News
      • gfd_country_code:['AS']
      • gfd_country_name:['AUSTRALIA']
      • glide_index:NA
      • otsu_sample_res:231.66
      • slope_threshold:5
      • system:asset_size:8988914
        • type:LinearRing
            • 0:148.77567883770345
            • 1:-37.692190384571404
            • 0:149.9754567941598
            • 1:-37.692185550952246
            • 0:149.9749728427817
            • 1:-17.513770729195038
            • 0:149.17465716800353
            • 1:-17.51377546120134
            • 0:147.97772223305088
            • 1:-17.512579461230274
            • 0:146.3818089465066
            • 1:-17.512579422994158
            • 0:144.7858956959944
            • 1:-17.512579420723743
            • 0:143.1899824714047
            • 1:-17.512579400063284
            • 0:141.59406922326394
            • 1:-17.512579402044587
            • 0:139.99815600423668
            • 1:-17.5125794432258
            • 0:138.40224272344366
            • 1:-17.512579385808014
            • 0:137.20294874528088
            • 1:-17.51377068144409
            • 0:137.20246484353038
            • 1:-37.69218554664975
            • 0:139.59917774865693
            • 1:-37.692190354915816
            • 0:141.1950909277643
            • 1:-37.69219034319341
            • 0:143.58896079571315
            • 1:-37.69219036366114
            • 0:145.18487408134828
            • 1:-37.69219039048359
            • 0:147.17976561083674
            • 1:-37.692190365837604
            • 0:148.77567883770345
            • 1:-37.692190384571404
      • system:index:DFO_1586_From_20000218_to_20000301
      • system:time_end:951868800000
      • system:time_start:950832000000
      • threshold_b1b2:0.711
      • threshold_b7:1815.18
      • threshold_type:otsu
    • type:Image
    • id:LANDSAT/LE07/C02/T1_L2/LE07_001004_20000610
    • version:1612545591275537
        • id:SR_B1
        • crs:EPSG:32628
          • 0:30
          • 1:0
          • 2:337185
          • 3:0
          • 4:-30
          • 5:8808915
          • type:PixelType
          • max:65535
          • min:0
          • precision:int
          • 0:9101
          • 1:9011
        • id:SR_B2
        • crs:EPSG:32628
          • 0:30
          • 1:0
          • 2:337185
          • 3:0
          • 4:-30
          • 5:8808915
          • type:PixelType
          • max:65535
          • min:0
          • precision:int
          • 0:9101
          • 1:9011
        • id:SR_B3
        • crs:EPSG:32628
          • 0:30
          • 1:0
          • 2:337185
          • 3:0
          • 4:-30
          • 5:8808915
          • type:PixelType
          • max:65535
          • min:0
          • precision:int
          • 0:9101
          • 1:9011
        • id:SR_B4
        • crs:EPSG:32628
          • 0:30
          • 1:0
          • 2:337185
          • 3:0
          • 4:-30
          • 5:8808915
          • type:PixelType
          • max:65535
          • min:0
          • precision:int
          • 0:9101
          • 1:9011
        • id:SR_B5
        • crs:EPSG:32628
          • 0:30
          • 1:0
          • 2:337185
          • 3:0
          • 4:-30
          • 5:8808915
          • type:PixelType
          • max:65535
          • min:0
          • precision:int
          • 0:9101
          • 1:9011
        • id:SR_B7
        • crs:EPSG:32628
          • 0:30
          • 1:0
          • 2:337185
          • 3:0
          • 4:-30
          • 5:8808915
          • type:PixelType
          • max:65535
          • min:0
          • precision:int
          • 0:9101
          • 1:9011
        • id:SR_ATMOS_OPACITY
        • crs:EPSG:32628
          • 0:30
          • 1:0
          • 2:337185
          • 3:0
          • 4:-30
          • 5:8808915
          • type:PixelType
          • max:32767
          • min:-32768
          • precision:int
          • 0:9101
          • 1:9011
        • id:SR_CLOUD_QA
        • crs:EPSG:32628
          • 0:30
          • 1:0
          • 2:337185
          • 3:0
          • 4:-30
          • 5:8808915
          • type:PixelType
          • max:255
          • min:0
          • precision:int
          • 0:9101
          • 1:9011
        • id:ST_B6
        • crs:EPSG:32628
          • 0:30
          • 1:0
          • 2:337185
          • 3:0
          • 4:-30
          • 5:8808915
          • type:PixelType
          • max:65535
          • min:0
          • precision:int
          • 0:9101
          • 1:9011
        • id:ST_ATRAN
        • crs:EPSG:32628
          • 0:30
          • 1:0
          • 2:337185
          • 3:0
          • 4:-30
          • 5:8808915
          • type:PixelType
          • max:32767
          • min:-32768
          • precision:int
          • 0:9101
          • 1:9011
        • id:ST_CDIST
        • crs:EPSG:32628
          • 0:30
          • 1:0
          • 2:337185
          • 3:0
          • 4:-30
          • 5:8808915
          • type:PixelType
          • max:32767
          • min:-32768
          • precision:int
          • 0:9101
          • 1:9011
        • id:ST_DRAD
        • crs:EPSG:32628
          • 0:30
          • 1:0
          • 2:337185
          • 3:0
          • 4:-30
          • 5:8808915
          • type:PixelType
          • max:32767
          • min:-32768
          • precision:int
          • 0:9101
          • 1:9011
        • id:ST_EMIS
        • crs:EPSG:32628
          • 0:30
          • 1:0
          • 2:337185
          • 3:0
          • 4:-30
          • 5:8808915
          • type:PixelType
          • max:32767
          • min:-32768
          • precision:int
          • 0:9101
          • 1:9011
        • id:ST_EMSD
        • crs:EPSG:32628
          • 0:30
          • 1:0
          • 2:337185
          • 3:0
          • 4:-30
          • 5:8808915
          • type:PixelType
          • max:32767
          • min:-32768
          • precision:int
          • 0:9101
          • 1:9011
        • id:ST_QA
        • crs:EPSG:32628
          • 0:30
          • 1:0
          • 2:337185
          • 3:0
          • 4:-30
          • 5:8808915
          • type:PixelType
          • max:32767
          • min:-32768
          • precision:int
          • 0:9101
          • 1:9011
        • id:ST_TRAD
        • crs:EPSG:32628
          • 0:30
          • 1:0
          • 2:337185
          • 3:0
          • 4:-30
          • 5:8808915
          • type:PixelType
          • max:32767
          • min:-32768
          • precision:int
          • 0:9101
          • 1:9011
        • id:ST_URAD
        • crs:EPSG:32628
          • 0:30
          • 1:0
          • 2:337185
          • 3:0
          • 4:-30
          • 5:8808915
          • type:PixelType
          • max:32767
          • min:-32768
          • precision:int
          • 0:9101
          • 1:9011
        • id:QA_PIXEL
        • crs:EPSG:32628
          • 0:30
          • 1:0
          • 2:337185
          • 3:0
          • 4:-30
          • 5:8808915
          • type:PixelType
          • max:65535
          • min:0
          • precision:int
          • 0:9101
          • 1:9011
        • id:QA_RADSAT
        • crs:EPSG:32628
          • 0:30
          • 1:0
          • 2:337185
          • 3:0
          • 4:-30
          • 5:8808915
          • type:PixelType
          • max:65535
          • min:0
          • precision:int
          • 0:9101
          • 1:9011
      • ALGORITHM_SOURCE_SURFACE_REFLECTANCE:LEDAPS_3.4.0
      • ALGORITHM_SOURCE_SURFACE_TEMPERATURE:st_1.3.0
      • CLOUD_COVER:0
      • CLOUD_COVER_LAND:0
      • COLLECTION_CATEGORY:T1
      • COLLECTION_NUMBER:2
      • CORRECTION_BIAS_BAND_1:INTERNAL_CALIBRATION
      • CORRECTION_BIAS_BAND_2:INTERNAL_CALIBRATION
      • CORRECTION_BIAS_BAND_3:INTERNAL_CALIBRATION
      • CORRECTION_BIAS_BAND_4:INTERNAL_CALIBRATION
      • CORRECTION_BIAS_BAND_5:INTERNAL_CALIBRATION
      • CORRECTION_BIAS_BAND_6_VCID_1:INTERNAL_CALIBRATION
      • CORRECTION_BIAS_BAND_6_VCID_2:INTERNAL_CALIBRATION
      • CORRECTION_BIAS_BAND_7:INTERNAL_CALIBRATION
      • CORRECTION_BIAS_BAND_8:INTERNAL_CALIBRATION
      • CORRECTION_GAIN_BAND_1:CPF
      • CORRECTION_GAIN_BAND_2:CPF
      • CORRECTION_GAIN_BAND_3:CPF
      • CORRECTION_GAIN_BAND_4:CPF
      • CORRECTION_GAIN_BAND_5:CPF
      • CORRECTION_GAIN_BAND_6_VCID_1:CPF
      • CORRECTION_GAIN_BAND_6_VCID_2:CPF
      • CORRECTION_GAIN_BAND_7:CPF
      • CORRECTION_GAIN_BAND_8:CPF
      • DATA_SOURCE_AIR_TEMPERATURE:NCEP
      • DATA_SOURCE_ELEVATION:GLS2000
      • DATA_SOURCE_OZONE:TOMS
      • DATA_SOURCE_PRESSURE:NCEP
      • DATA_SOURCE_REANALYSIS:GEOS-5 FP-IT
      • DATA_SOURCE_WATER_VAPOR:NCEP
      • DATE_ACQUIRED:2000-06-10
      • DATE_PRODUCT_GENERATED:1600413331000
      • DATUM:WGS84
      • EARTH_SUN_DISTANCE:1.0153425
      • ELLIPSOID:WGS84
      • EPHEMERIS_TYPE:DEFINITIVE
      • GAIN_BAND_1:H
      • GAIN_BAND_2:H
      • GAIN_BAND_3:H
      • GAIN_BAND_4:H
      • GAIN_BAND_5:H
      • GAIN_BAND_6_VCID_1:L
      • GAIN_BAND_6_VCID_2:H
      • GAIN_BAND_7:H
      • GAIN_BAND_8:L
      • GAIN_CHANGE_BAND_1:HH
      • GAIN_CHANGE_BAND_2:HH
      • GAIN_CHANGE_BAND_3:HH
      • GAIN_CHANGE_BAND_4:HH
      • GAIN_CHANGE_BAND_5:HH
      • GAIN_CHANGE_BAND_6_VCID_1:LL
      • GAIN_CHANGE_BAND_6_VCID_2:HH
      • GAIN_CHANGE_BAND_7:HH
      • GAIN_CHANGE_BAND_8:LL
      • GAIN_CHANGE_SCAN_BAND_1:0
      • GAIN_CHANGE_SCAN_BAND_2:0
      • GAIN_CHANGE_SCAN_BAND_3:0
      • GAIN_CHANGE_SCAN_BAND_4:0
      • GAIN_CHANGE_SCAN_BAND_5:0
      • GAIN_CHANGE_SCAN_BAND_6_VCID_1:0
      • GAIN_CHANGE_SCAN_BAND_6_VCID_2:0
      • GAIN_CHANGE_SCAN_BAND_7:0
      • GAIN_CHANGE_SCAN_BAND_8:0
      • GEOMETRIC_RMSE_MODEL:9.307
      • GEOMETRIC_RMSE_MODEL_X:5.995
      • GEOMETRIC_RMSE_MODEL_Y:7.119
      • GRID_CELL_SIZE_REFLECTIVE:30
      • GRID_CELL_SIZE_THERMAL:30
      • GROUND_CONTROL_POINTS_MODEL:23
      • GROUND_CONTROL_POINTS_VERSION:5
      • IMAGE_QUALITY:9
      • L1_DATE_PRODUCT_GENERATED:2020-09-18T06:45:47Z
      • L1_LANDSAT_PRODUCT_ID:LE07_L1TP_001004_20000610_20200918_02_T1
      • L1_PROCESSING_LEVEL:L1TP
      • L1_PROCESSING_SOFTWARE_VERSION:LPGS_15.3.1c
      • L1_REQUEST_ID:L2
      • LANDSAT_PRODUCT_ID:LE07_L2SP_001004_20000610_20200918_02_T1
      • LANDSAT_SCENE_ID:LE70010042000162SGS00
      • MAP_PROJECTION:UTM
      • ORIENTATION:NORTH_UP
      • PROCESSING_LEVEL:L2SP
      • PROCESSING_SOFTWARE_VERSION:LPGS_15.3.1c
      • REFLECTANCE_ADD_BAND_1:-0.2
      • REFLECTANCE_ADD_BAND_2:-0.2
      • REFLECTANCE_ADD_BAND_3:-0.2
      • REFLECTANCE_ADD_BAND_4:-0.2
      • REFLECTANCE_ADD_BAND_5:-0.2
      • REFLECTANCE_ADD_BAND_7:-0.2
      • REFLECTANCE_MULT_BAND_1:2.75e-05
      • REFLECTANCE_MULT_BAND_2:2.75e-05
      • REFLECTANCE_MULT_BAND_3:2.75e-05
      • REFLECTANCE_MULT_BAND_4:2.75e-05
      • REFLECTANCE_MULT_BAND_5:2.75e-05
      • REFLECTANCE_MULT_BAND_7:2.75e-05
      • REFLECTIVE_LINES:9011
      • REFLECTIVE_SAMPLES:9101
      • REQUEST_ID:L2
      • SATURATION_BAND_1:Y
      • SATURATION_BAND_2:Y
      • SATURATION_BAND_3:Y
      • SATURATION_BAND_4:Y
      • SATURATION_BAND_5:N
      • SATURATION_BAND_6_VCID_1:N
      • SATURATION_BAND_6_VCID_2:N
      • SATURATION_BAND_7:N
      • SATURATION_BAND_8:N
      • SCENE_CENTER_TIME:14:00:14.7523815Z
      • SENSOR_ANOMALIES:NONE
      • SENSOR_ID:ETM
      • SENSOR_MODE:SAM
      • SENSOR_MODE_SLC:ON
      • SPACECRAFT_ID:LANDSAT_7
      • STATION_ID:SGS
      • SUN_AZIMUTH:-164.3367295
      • SUN_ELEVATION:34.53465096
      • TEMPERATURE_ADD_BAND_ST_B6:149
      • TEMPERATURE_MAXIMUM_BAND_ST_B6:372.999941
      • TEMPERATURE_MINIMUM_BAND_ST_B6:149.003418
      • TEMPERATURE_MULT_BAND_ST_B6:0.00341802
      • THERMAL_LINES:9011
      • THERMAL_SAMPLES:9101
      • UTM_ZONE:28
      • WRS_PATH:1
      • WRS_ROW:4
      • WRS_TYPE:2
      • system:asset_size:263744119
        • type:LinearRing
            • 0:-10.798987380789246
            • 1:78.01858268319442
            • 0:-10.411971358532213
            • 1:78.087262939023
            • 0:-10.405192241584777
            • 1:78.09033737226109
            • 0:-10.45002686192686
            • 1:78.10115918140148
            • 0:-13.32072524601852
            • 1:78.7326873858347
            • 0:-15.245031539703183
            • 1:79.10582796599041
            • 0:-16.33166698497307
            • 1:79.30080490370658
            • 0:-16.352886368757954
            • 1:79.29918418754635
            • 0:-16.38707288609935
            • 1:79.29308175095046
            • 0:-21.80626662764263
            • 1:78.07770870627853
            • 0:-21.83782445545615
            • 1:78.0686748141708
            • 0:-21.846477474236046
            • 1:78.0643237332263
            • 0:-21.724798617349382
            • 1:78.0443847852502
            • 0:-18.016971684322044
            • 1:77.38452706148878
            • 0:-15.977762221404255
            • 1:76.96665695958274
            • 0:-15.96255873558572
            • 1:76.96844307081405
            • 0:-15.920977871524345
            • 1:76.97729523454795
            • 0:-14.553805105872115
            • 1:77.28118851198714
            • 0:-13.06187389601564
            • 1:77.59055217371134
            • 0:-11.819086610724035
            • 1:77.83121753786516
            • 0:-11.497264139900494
            • 1:77.89133707865153
            • 0:-10.798987380789246
            • 1:78.01858268319442
      • system:index:LE07_001004_20000610
      • system:time_end:960645614752
      • system:time_start:960645614752
    • type:Image
    • id:CGIAR/SRTM90_V4
    • version:1641990053291277
        • id:elevation
        • crs:EPSG:4326
          • 0:0.000833333333333
          • 1:0
          • 2:-180
          • 3:0
          • 4:-0.000833333333333
          • 5:60
          • type:PixelType
          • max:32767
          • min:-32768
          • precision:int
          • 0:432000
          • 1:144000
        • 0:950227200000
        • 1:951177600000
      • description:

        The Shuttle Radar Topography Mission (SRTM) digital elevation dataset was originally produced to provide consistent, high-quality elevation data at near global scope. This version of the SRTM digital elevation data has been processed to fill data voids, and to facilitate its ease of use.

        Provider: NASA/CGIAR

        Bands

        NameDescription
        elevation

        Elevation

        Terms of Use

        DISTRIBUTION. Users are prohibited from any commercial, non-free resale, or redistribution without explicit written permission from CIAT. Users should acknowledge CIAT as the source used in the creation of any reports, publications, new datasets, derived products, or services resulting from the use of this dataset. CIAT also request reprints of any publications and notification of any redistributing efforts. For commercial access to the data, send requests to Andy Jarvis.

        NO WARRANTY OR LIABILITY. CIAT provides these data without any warranty of any kind whatsoever, either express or implied, including warranties of merchantability and fitness for a particular purpose. CIAT shall not be liable for incidental, consequential, or special damages arising out of the use of any data.

        ACKNOWLEDGMENT AND CITATION. Any users are kindly asked to cite this data in any published material produced using this data, and if possible link web pages to the CIAT-CSI SRTM website.

        Suggested citation(s)

        • Jarvis, A., H.I. Reuter, A. Nelson, E. Guevara. 2008. Hole-filled SRTM for the globe Version 4, available from the CGIAR-CSI SRTM 90m Database: https://srtm.csi.cgiar.org.

        • 0:cgiar
        • 1:dem
        • 2:elevation
        • 3:geophysical
        • 4:srtm
        • 5:topography
      • period:0
        • 0:srtm
        • 1:elevation
        • 2:topography
        • 3:dem
        • 4:geophysical
      • provider:NASA/CGIAR
      • provider_url:https://srtm.csi.cgiar.org/
      • sample:https://mw1.google.com/ges/dd/images/SRTM90_V4_sample.png
        • 0:cgiar
      • system:asset_size:18827626666
      • system:time_end:951177600000
      • system:time_start:950227200000
      • system:visualization_0_bands:elevation
      • system:visualization_0_gamma:1.6
      • system:visualization_0_max:8000.0
      • system:visualization_0_min:-100.0
      • system:visualization_0_name:Elevation
        • 0:cgiar
        • 1:dem
        • 2:elevation
        • 3:geophysical
        • 4:srtm
        • 5:topography
      • thumb:https://mw1.google.com/ges/dd/images/SRTM90_V4_thumb.png
      • title:SRTM Digital Elevation Data Version 4
      • type_name:Image
      • visualization_0_bands:elevation
      • visualization_0_max:8000.0
      • visualization_0_min:-100.0
      • visualization_0_name:Elevation

Mask the images with conditions:

  • Clear percentage of flood images >= 0.5

  • Not permanent water

In [ ]:
def mask_flood(image):
    clear_perc_mask = image.select("clear_perc").gte(0.5)  # Clear percentage >= 50%
    jrc_perm_water_mask = image.select("jrc_perm_water").eq(
        0
    )  # 0 - non-water | 1 - permanent water

    combined_mask = clear_perc_mask.And(jrc_perm_water_mask)

    masked_image = image.updateMask(combined_mask)

    return masked_image


flood_collection = flood_collection.map(mask_flood)

Compose the following bands into an image collection:

  • x, y, elevation as z and slope

  • SR_B1, SR_B2, SR_B3, SR_B4, SR_B5, SR_B7 from Landsat Collection

  • NDWI calculated using SR_B2 and SR_B4

  • duration, clear_perc, flooded from Global Flood Database

In [ ]:
landsat_interest_bands = ["SR_B1", "SR_B2", "SR_B3", "SR_B4", "SR_B5", "SR_B7"]
flood_interest_bands = ["duration", "clear_perc", "flooded"]

elevation = elevation_image.select("elevation").rename("z")
slope = ee.Terrain.slope(elevation_image)


def filter_single_scene(image):
    date_range = ee.DateRange(
        image.get("system:time_start"), image.get("system:time_end")
    )
    return ee.Algorithms.If(
        date_range.end().difference(date_range.start(), "days").gt(0), image, None
    )


def compose_landsat(image):
    geometry = image.geometry()
    date_range = ee.DateRange(
        image.get("system:time_start"), image.get("system:time_end")
    )
    landsat_image = (
        landsat_collection.select(ee.List(landsat_interest_bands))
        .filterDate(date_range.start(), date_range.end())
        .filterBounds(geometry)
        .median()
    )
    return ee.Algorithms.If(
        landsat_image.bandNames().length(),
        image.select(flood_interest_bands).addBands(
            landsat_image,
            ee.List(landsat_interest_bands),
        ),
        None,
    )


def add_elevation_bands(image: ee.Image):
    return image.addBands(elevation).addBands(slope)


def set_default_projection(image: ee.Image):
    projection = image.select("flooded").projection()
    return image.setDefaultProjection(projection)


images = (
    flood_collection.map(filter_single_scene, True)
    .map(compose_landsat, True)
    .map(add_elevation_bands)
    .map(set_default_projection)
)
print(f"Number of flood event images left: {images.size().getInfo()}")
images.first()
Number of flood event images left: 848
Out[ ]:
    • type:Image
    • id:GLOBAL_FLOOD_DB/MODIS_EVENTS/V1/DFO_1586_From_20000218_to_20000301
    • version:1685079690200045
        • id:duration
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:137.20418491999513
          • 3:0
          • 4:-0.002245788210298804
          • 5:-17.514902252120372
          • type:PixelType
          • max:65535
          • min:0
          • precision:int
          • 0:5686
          • 1:8984
        • id:clear_perc
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:137.20418491999513
          • 3:0
          • 4:-0.002245788210298804
          • 5:-17.514902252120372
          • type:PixelType
          • precision:float
          • 0:5686
          • 1:8984
        • id:flooded
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:137.20418491999513
          • 3:0
          • 4:-0.002245788210298804
          • 5:-17.514902252120372
          • type:PixelType
          • max:255
          • min:0
          • precision:int
          • 0:5686
          • 1:8984
        • id:SR_B1
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:137.20418491999513
          • 3:0
          • 4:-0.002245788210298804
          • 5:-17.514902252120372
          • type:PixelType
          • max:65535
          • min:0
          • precision:double
        • id:SR_B2
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:137.20418491999513
          • 3:0
          • 4:-0.002245788210298804
          • 5:-17.514902252120372
          • type:PixelType
          • max:65535
          • min:0
          • precision:double
        • id:SR_B3
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:137.20418491999513
          • 3:0
          • 4:-0.002245788210298804
          • 5:-17.514902252120372
          • type:PixelType
          • max:65535
          • min:0
          • precision:double
        • id:SR_B4
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:137.20418491999513
          • 3:0
          • 4:-0.002245788210298804
          • 5:-17.514902252120372
          • type:PixelType
          • max:65535
          • min:0
          • precision:double
        • id:SR_B5
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:137.20418491999513
          • 3:0
          • 4:-0.002245788210298804
          • 5:-17.514902252120372
          • type:PixelType
          • max:65535
          • min:0
          • precision:double
        • id:SR_B7
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:137.20418491999513
          • 3:0
          • 4:-0.002245788210298804
          • 5:-17.514902252120372
          • type:PixelType
          • max:65535
          • min:0
          • precision:double
        • id:z
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:137.20418491999513
          • 3:0
          • 4:-0.002245788210298804
          • 5:-17.514902252120372
          • type:PixelType
          • max:32767
          • min:-32768
          • precision:int
          • 0:160302
          • 1:53434
          • 0:-141245
          • 1:-34516
        • id:slope
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:137.20418491999513
          • 3:0
          • 4:-0.002245788210298804
          • 5:-17.514902252120372
          • type:PixelType
          • max:90
          • min:0
          • precision:float
      • id:1586
      • cc:AUS
      • composite_type:3Day
      • countries:Australia
      • dfo_centroid_x:143.6978
      • dfo_centroid_y:-31.268059
      • dfo_country:Australia
      • dfo_dead:1
      • dfo_displaced:200
      • dfo_main_cause:Monsoonal rain
      • dfo_other_country:NA
      • dfo_severity:2
      • dfo_validation_type:News
      • gfd_country_code:['AS']
      • gfd_country_name:['AUSTRALIA']
      • glide_index:NA
      • otsu_sample_res:231.66
      • slope_threshold:5
      • system:asset_size:8988914
        • type:LinearRing
            • 0:148.77567883770345
            • 1:-37.692190384571404
            • 0:149.9754567941598
            • 1:-37.692185550952246
            • 0:149.9749728427817
            • 1:-17.513770729195038
            • 0:149.17465716800353
            • 1:-17.51377546120134
            • 0:147.97772223305088
            • 1:-17.512579461230274
            • 0:146.3818089465066
            • 1:-17.512579422994158
            • 0:144.7858956959944
            • 1:-17.512579420723743
            • 0:143.1899824714047
            • 1:-17.512579400063284
            • 0:141.59406922326394
            • 1:-17.512579402044587
            • 0:139.99815600423668
            • 1:-17.5125794432258
            • 0:138.40224272344366
            • 1:-17.512579385808014
            • 0:137.20294874528088
            • 1:-17.51377068144409
            • 0:137.20246484353038
            • 1:-37.69218554664975
            • 0:139.59917774865693
            • 1:-37.692190354915816
            • 0:141.1950909277643
            • 1:-37.69219034319341
            • 0:143.58896079571315
            • 1:-37.69219036366114
            • 0:145.18487408134828
            • 1:-37.69219039048359
            • 0:147.17976561083674
            • 1:-37.692190365837604
            • 0:148.77567883770345
            • 1:-37.692190384571404
      • system:index:DFO_1586_From_20000218_to_20000301
      • system:time_end:951868800000
      • system:time_start:950832000000
      • threshold_b1b2:0.711
      • threshold_b7:1815.18
      • threshold_type:otsu
In [ ]:
def to_features(image: ee.Image):
    # Reduce resolution of bands to a common scale
    image = image.addBands(
        image.select([*landsat_interest_bands, "z", "slope"]).reduceResolution(
            reducer=ee.Reducer.mean()
        ),
        overwrite=True,
    )
    return image.stratifiedSample(
        classBand="flooded",
        numPoints=50,
        dropNulls=True,
        scale=250,
        tileScale=4,
        region=image.geometry(),
        geometries=False,
        projection=image.projection(),
    )


dataset = images.map(to_features).flatten()
In [ ]:
# geemap.ee_export_vector_to_drive(dataset, description="flood_dataset", fileFormat="csv")

Extract the details of flood events

In [ ]:
interest_properties = {
    "system:index": "id",
    "dfo_country": "primary_country",
    "dfo_severity": "severity",
    "system:time_start": "start_date",
    "system:time_end": "end_date",
    "dfo_centroid_x": "center_lon",
    "dfo_centroid_y": "center_lat",
    "dfo_main_cause": "main_cause",
    "dfo_dead": "dead",
    "dfo_displaced": "displaced",
}


def extract_properties(image):
    old_keys = ee.List(list(interest_properties.keys()))
    new_keys = ee.List(list(interest_properties.values()))
    props: ee.Dictionary = image.toDictionary(old_keys).rename(old_keys, new_keys, True)
    mask = image.select("flooded").gt(0)
    area = (
        ee.Image.pixelArea()
        .mask(mask)
        .reduceRegion(
            reducer=ee.Reducer.sum(),
            scale=250,
            geometry=image.geometry(),
            maxPixels=412598311,
            crs=image.projection(),
        )
    ).get("area")
    props = props.set("area", area)
    return ee.Feature(None, props)


flood_properties = ee.FeatureCollection(flood_collection.map(extract_properties))
In [ ]:
# geemap.ee_to_csv(flood_properties, filename="../data/asg3/flood_props.csv")

Data Cleaning¶

Checking for data distribution and null entry

In [ ]:
df_flood_props = pd.read_csv("../data/asg3/flood_props.csv", index_col="id").reset_index()
display(df_flood_props.head(), df_flood_props.describe(), df_flood_props.isna().sum())
id area center_lat center_lon dead displaced end_date main_cause primary_country severity start_date
0 DFO_1586_From_20000218_to_20000301 3.167333e+08 -31.268059 143.697800 1 200 951868800000 Monsoonal rain Australia 2.0 950832000000
1 DFO_1587_From_20000217_to_20000311 2.502000e+08 -15.782624 47.295670 200 800000 952732800000 Tropical cyclone Madagascar 1.0 950745600000
2 DFO_1595_From_20000405_to_20000425 8.910377e+07 46.763746 22.415404 10 623 956620800000 Snowmelt Romania 2.0 954892800000
3 DFO_1614_From_20000711_to_20000810 4.397571e+09 11.242567 105.063841 33 25000 965865600000 Monsoonal rain Thailand 1.0 963273600000
4 DFO_1627_From_20000830_to_20000910 9.194628e+08 43.773883 132.057679 31 14140 968544000000 Tropical cyclone China 1.0 967593600000
area center_lat center_lon dead displaced end_date severity start_date
count 9.130000e+02 913.000000 913.000000 913.000000 9.130000e+02 9.130000e+02 913.000000 9.130000e+02
mean 6.953646e+09 15.395235 38.340958 180.046002 2.674868e+05 1.224903e+12 1.297371 1.223187e+12
std 1.188170e+10 24.050427 78.687131 3332.691202 1.995806e+06 1.472525e+11 0.392520 1.474857e+11
min 0.000000e+00 -45.953281 -156.215507 0.000000 0.000000e+00 9.518688e+11 1.000000 9.507456e+11
25% 7.060815e+08 0.058024 -13.967824 0.000000 3.000000e+01 1.103328e+12 1.000000 1.102291e+12
50% 2.223036e+09 18.877688 55.552555 5.000000 3.500000e+03 1.197245e+12 1.000000 1.195085e+12
75% 7.381296e+09 32.886368 103.415798 27.000000 4.000000e+04 1.330301e+12 1.500000 1.329696e+12
max 9.854770e+10 68.000870 178.075692 100000.000000 4.000000e+07 1.544400e+12 2.000000 1.543968e+12
id                 0
area               0
center_lat         0
center_lon         0
dead               0
displaced          0
end_date           0
main_cause         0
primary_country    0
severity           0
start_date         0
dtype: int64
In [ ]:
df_flood_data = pd.read_csv("../data/asg3/flood_dataset.csv", index_col="system:index").reset_index()
display(df_flood_data.head(), df_flood_data.describe(), df_flood_data.isna().sum())
system:index SR_B1 SR_B2 SR_B3 SR_B4 SR_B5 SR_B7 clear_perc duration flooded slope z .geo
0 DFO_1586_From_20000218_to_20000301_0 38182.0 29077.0 34219.0 22596.0 21604.0 18202.0 1.0 0 0 1.317596 119.0 {"type":"MultiPoint","coordinates":[]}
1 DFO_1586_From_20000218_to_20000301_1 10304.0 12775.0 16730.0 20689.0 24540.0 21194.0 1.0 0 0 0.154581 151.0 {"type":"MultiPoint","coordinates":[]}
2 DFO_1586_From_20000218_to_20000301_2 14348.0 17113.0 19445.0 20881.0 24473.0 22067.0 1.0 0 0 0.000000 29.0 {"type":"MultiPoint","coordinates":[]}
3 DFO_1586_From_20000218_to_20000301_3 65535.0 65535.0 65535.0 31902.0 25274.0 20551.0 1.0 0 0 1.292620 47.0 {"type":"MultiPoint","coordinates":[]}
4 DFO_1586_From_20000218_to_20000301_4 12960.0 13637.0 14640.0 16029.0 16719.0 14145.0 1.0 0 0 0.240548 65.0 {"type":"MultiPoint","coordinates":[]}
SR_B1 SR_B2 SR_B3 SR_B4 SR_B5 SR_B7 clear_perc duration flooded slope z
count 80354.000000 80354.000000 80354.000000 80354.000000 80354.000000 80354.000000 80354.000000 80354.000000 80354.000000 80354.000000 80354.000000
mean 20092.456841 19307.969585 20190.967046 19047.787428 15720.140640 13064.029526 0.983093 2.359957 0.487891 1.249266 385.527740
std 18827.824469 16747.044276 17624.727589 9175.595795 7648.333485 4669.589228 0.028030 5.200521 0.499856 2.410576 761.573009
min 1347.000000 3866.500000 5128.500000 5953.000000 6992.500000 7201.000000 0.625000 0.000000 0.000000 0.000000 -123.000000
25% 9200.000000 9996.000000 9938.000000 12922.750000 10818.000000 9401.625000 0.972222 0.000000 0.000000 0.221696 26.000000
50% 10993.000000 12044.000000 12450.000000 17228.750000 14452.000000 11975.000000 1.000000 0.000000 0.000000 0.490689 135.000000
75% 18317.000000 18206.375000 19191.750000 22198.000000 18196.875000 15736.000000 1.000000 3.000000 1.000000 1.305663 372.000000
max 65535.000000 65535.000000 65535.000000 65535.000000 65535.000000 65535.000000 1.000000 145.000000 1.000000 50.952850 6616.000000
system:index    0
SR_B1           0
SR_B2           0
SR_B3           0
SR_B4           0
SR_B5           0
SR_B7           0
clear_perc      0
duration        0
flooded         0
slope           0
z               0
.geo            0
dtype: int64
In [ ]:
with open("../data/asg3/countries.geojson", encoding="utf-8") as f:
    countries_boundary = geojson.load(f)

countries_ADM = [
    feature["properties"]["ADMIN"] for feature in countries_boundary["features"]
]

Verify all country is valid with ADM standard

In [ ]:
def check_country_exists_ADM(countries):
    return pd.DataFrame(
        {
            "primary_country": countries,
            "exists_in_ADM": countries.isin(countries_ADM),
        }
    )


df_temp = check_country_exists_ADM(
    pd.Series(df_flood_props["primary_country"].unique())
)
df_temp[df_temp["exists_in_ADM"] == False]
Out[ ]:
primary_country exists_in_ADM
14 USA False
18 Africa False
23 Texas False
24 Columbia False
33 Burma False
41 UK False
66 Bosnia-Herzegovina False
68 Venezulea False
83 Serbia-Montenegro False
92 Democratic Republic of Congo False
93 Tanzania False
94 Guatamala False
97 Serbia False
103 Inda False
106 Tasmania False
109 Gaza False
110 Scotland False
111 The Gambia False
116 Kazahkstan False
117 Uruguay, False
119 Bahamas False

Transform invalid countries' value according to ADM standard

In [ ]:
country_to_adm = {
    "USA": "United States of America",
    "UK": "United Kingdom",
    "Burma": "Myanmar",
    "Tanzania": "United Republic of Tanzania",
    "Columbia": "Colombia",
    "Bosnia-Herzegovina": "Bosnia and Herzegovina",
    "Guatamala": "Guatemala",
    "Serbia": "Republic of Serbia",
    "Africa": "Central African Republic",  # Africa is a continent, not a country
    "Texas": "United States of America",  # Texas is a state in the USA
    "Venezulea": "Venezuela",
    "Serbia-Montenegro": "Republic of Serbia",  # No longer exists as a single country
    "Inda": "India",
    "Democratic Republic of Congo": "Democratic Republic of the Congo",
    "Tasmania": "Australia",  # Tasmania is a state in Australia
    "The Gambia": "Gambia",
    "Scotland": "United Kingdom",  # Scotland is part of the United Kingdom
    "Gaza": "Palestine",  # Gaza is a region in Palestine
    "Kazahkstan": "Kazakhstan",
    "Uruguay,": "Uruguay",
    "Bahamas": "The Bahamas",
}


def process_country_name(country):
    processed_country = country_to_adm.get(country)
    return processed_country if processed_country else country


df_flood_props_cleaned = df_flood_props.copy()
df_flood_props_cleaned["primary_country"] = df_flood_props["primary_country"].apply(
    process_country_name
)
df_temp = check_country_exists_ADM(
    pd.Series(df_flood_props_cleaned["primary_country"].unique())
)
df_temp.sort_values("exists_in_ADM")
Out[ ]:
primary_country exists_in_ADM
0 Australia True
80 Republic of Serbia True
79 Slovakia True
78 Ecuador True
77 Jamaica True
... ... ...
31 Iran True
30 Germany True
29 South Africa True
40 United Kingdom True
110 Mongolia True

111 rows × 2 columns

In [ ]:
# df_flood_props_cleaned.to_csv("../data/asg3/flood_props_cleaned.csv")

Data Visualization¶

In [ ]:
causes = [x.lower() for x in df_flood_props_cleaned["main_cause"].to_list()]

categories = {
    "Heavy rain": ["rain", "monsoon", "torrential"],
    "Dam break/release": ["dam", "levy", "release"],
    "Snowmelt": ["snow"],
    "Ice jam": ["ice"],
    "Tropical storm": ["tropical", "typhoon", "hurricane"],
}

df_flood_cause = pd.DataFrame(
    data={
        "event": list(categories.keys()) + ["Other"],
        "count": np.zeros(len(categories) + 1, dtype=int),
        "matched": [[] for _ in range(len(categories) + 1)],
    }
)

for event in causes:
    matched = False
    for category, keywords in categories.items():
        if any(keyword in event for keyword in keywords):
            idx = df_flood_cause.index[df_flood_cause["event"] == category].tolist()[0]
            df_flood_cause.at[idx, "count"] += 1
            df_flood_cause.at[idx, "matched"].append(event)
            matched = True
    if not matched:
        idx = df_flood_cause.index[df_flood_cause["event"] == "Other"].tolist()[0]
        df_flood_cause.at[idx, "count"] += 1
        df_flood_cause.at[idx, "matched"].append(event)


fig = px.bar(
    df_flood_cause,
    x="event",
    y="count",
    color="count",
    text="count",
)
fig.update_layout(title=dict(text="Main Cause of Flood Events", x=0.5))
fig.show(renderer="notebook")
fig = px.pie(
    df_flood_data["flooded"].value_counts().to_frame().reset_index(),
    values="count",
    names=["Non-water", "Flooded"],
)
fig.update_layout(
    title=dict(text="Ratio of Flooded Area Against Non-Water Area in Dataset", x=0.5),
    margin_b=20,
)
fig.show(renderer="notebook")

Global flood occurrence (only primary influenced country)

In [ ]:
df_flood_by_country = pd.DataFrame(
    df_flood_props_cleaned["primary_country"].value_counts().reset_index(name="counts")
)

scattergeo_trace = go.Scattergeo(
    lat=df_flood_props_cleaned["center_lat"],
    lon=df_flood_props_cleaned["center_lon"],
    mode="markers",
    marker=dict(
        size=12,
        opacity=0.8,
        autocolorscale=False,
        symbol="triangle-down",
        colorscale="Reds",
        cmin=0,
        color=df_flood_props_cleaned["severity"],
        cmax=df_flood_props_cleaned["severity"].max(),
        colorbar_title="Severity",
        colorbar_x=1.2,
    ),
    hoverinfo="skip",
)

choropleth_trace = go.Choropleth(
    locations=df_flood_by_country["primary_country"],
    locationmode="country names",
    z=df_flood_by_country["counts"],
    colorscale="amp",
    colorbar_title="Occurence",
    hoverlabel_namelength=0,
)


fig = go.Figure(data=[scattergeo_trace, choropleth_trace])
fig.data[0].showlegend = False

fig.update_layout(
    title_text="Flood Severity and Occurrence",
    title_x=0.5,
    geo=dict(
        showland=True,
        landcolor="rgb(95, 138, 92)",
        showcountries=True,
        countrycolor="rgb(255, 255, 255)",
        showocean=True,
        oceancolor="rgb(158,202,225)",
        projection_type="orthographic",
    ),
    margin={"b": 15, "l": 10, "r": 0, "t": 70},
    shapes=list(
        [
            dict(
                fillcolor="rgb(95, 138, 92)",
                layer="below",
                line={"dash": "dash"},
                name="Country not primarily <br>affected by flood",
                showlegend=True,
                type="rect",
                xref="paper",
                x0=0.001, 
                x1=0.002,  
                yref="paper",
                y0=0.001,  
                y1=0.002,
            )
        ]
    ),
    legend=dict(
        yanchor="top",
        y=1,
        xanchor="left",
        x=0,
        bgcolor="LightSteelBlue",
        bordercolor="Black",
        borderwidth=1,
    ),
    updatemenus=[
        {
            "buttons": [
                {
                    "args": [
                        {"geo.projection.type": "orthographic"},
                    ],
                    "label": "Orthographic",
                    "method": "relayout",
                },
                {
                    "args": [
                        {"geo.projection.type": "equirectangular"},
                    ],
                    "label": "Equirectangular",
                    "method": "relayout",
                },
            ],
            "direction": "left",
            "showactive": True,
            "type": "buttons",
            "x": 0,
            "xanchor": "left",
            "y": 1.15,
            "yanchor": "top",
        }
    ],
)
fig.show(renderer="notebook")

fig = px.scatter(
    df_flood_props_cleaned,
    x="displaced",
    y="dead",
    size="area",
    color="primary_country",
    hover_name="id",
)
fig.update_layout(
    title=dict(text="Estimated Displaced Against Fatalities Due to Flood Event", x=0.5),
    height=750,
    margin_t=100,
)
fig.show(renderer="notebook")

Select the image of 2010 Pakistan floods for visualizing

In [ ]:
target_flood = "DFO_2507_From_20040620_to_20041007"
max_pixels = 10000
scale = 25000

target_image = (
    images.filter(ee.Filter.eq("system:index", target_flood)).first().unmask()
)

# Extract the image as features (x, y, z, flooded) to investigate the relationship between elevation/slope and flood
target_image_features = ee.FeatureCollection(
    target_image.reduceResolution(reducer=ee.Reducer.mean(), maxPixels=max_pixels)
    .addBands(ee.Image.pixelCoordinates(projection=target_image.projection()))
    .select(["x", "y", "z", "flooded"])
    .reduceRegion(
        ee.Reducer.toCollection(["x", "y", "z", "flooded"]),
        scale=scale,
        geometry=target_image.geometry().bounds(),
        crs=target_image.projection(),
    )
    .get("features")
)

display(target_image)
print(f"Number of points: {target_image_features.size().getInfo()}")
    • type:Image
    • id:GLOBAL_FLOOD_DB/MODIS_EVENTS/V1/DFO_2507_From_20040620_to_20041007
    • version:1685079779514552
        • id:duration
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:73.38337555972372
          • 3:0
          • 4:-0.002245788210298804
          • 5:32.792999446783135
          • type:PixelType
          • max:65535
          • min:0
          • precision:int
          • 0:11955
          • 1:7630
        • id:clear_perc
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:73.38337555972372
          • 3:0
          • 4:-0.002245788210298804
          • 5:32.792999446783135
          • type:PixelType
          • precision:float
          • 0:11955
          • 1:7630
        • id:flooded
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:73.38337555972372
          • 3:0
          • 4:-0.002245788210298804
          • 5:32.792999446783135
          • type:PixelType
          • max:255
          • min:0
          • precision:int
          • 0:11955
          • 1:7630
        • id:SR_B1
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:73.38337555972372
          • 3:0
          • 4:-0.002245788210298804
          • 5:32.792999446783135
          • type:PixelType
          • max:65535
          • min:0
          • precision:double
        • id:SR_B2
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:73.38337555972372
          • 3:0
          • 4:-0.002245788210298804
          • 5:32.792999446783135
          • type:PixelType
          • max:65535
          • min:0
          • precision:double
        • id:SR_B3
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:73.38337555972372
          • 3:0
          • 4:-0.002245788210298804
          • 5:32.792999446783135
          • type:PixelType
          • max:65535
          • min:0
          • precision:double
        • id:SR_B4
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:73.38337555972372
          • 3:0
          • 4:-0.002245788210298804
          • 5:32.792999446783135
          • type:PixelType
          • max:65535
          • min:0
          • precision:double
        • id:SR_B5
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:73.38337555972372
          • 3:0
          • 4:-0.002245788210298804
          • 5:32.792999446783135
          • type:PixelType
          • max:65535
          • min:0
          • precision:double
        • id:SR_B7
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:73.38337555972372
          • 3:0
          • 4:-0.002245788210298804
          • 5:32.792999446783135
          • type:PixelType
          • max:65535
          • min:0
          • precision:double
        • id:z
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:73.38337555972372
          • 3:0
          • 4:-0.002245788210298804
          • 5:32.792999446783135
          • type:PixelType
          • max:32767
          • min:-32768
          • precision:int
          • 0:160302
          • 1:53434
          • 0:-112827
          • 1:-12115
        • id:slope
        • crs:EPSG:4326
          • 0:0.002245788210298804
          • 1:0
          • 2:73.38337555972372
          • 3:0
          • 4:-0.002245788210298804
          • 5:32.792999446783135
          • type:PixelType
          • max:90
          • min:0
          • precision:float
      • id:2507
      • cc:CHN, IND, MMR, NPL, BGD, BTN
      • composite_type:3Day
      • countries:China, India, Myanmar (Burma), Nepal, Bangladesh, Bhutan
      • dfo_centroid_x:85.891802
      • dfo_centroid_y:25.498908
      • dfo_country:India
      • dfo_dead:3000
      • dfo_displaced:40000000
      • dfo_main_cause:Monsoonal rain
      • dfo_other_country:Bangladesh
      • dfo_severity:2
      • dfo_validation_type:News
      • gfd_country_code:['TH', 'BM', 'BG', 'IN', 'CH', 'BT', 'UU', 'NP']
      • gfd_country_name:['THAILAND', 'BURMA', 'BANGLADESH', 'INDIA', 'CHINA', 'BHUTAN', 'In dispute BHUTAN/CHINA', 'In dispute CHINA/INDIA', 'NEPAL', 'In dispute INDIA/NEPAL']
      • glide_index:NA
      • slope_threshold:5
      • system:asset_size:58741567
        • type:LinearRing
            • 0:91.00228587710755
            • 1:32.79412626840574
            • 0:87.6465168385273
            • 1:32.7941262869687
            • 0:83.45180557365863
            • 1:32.794126257013616
            • 0:80.09603652705451
            • 1:32.794126296226565
            • 0:75.90132526971159
            • 1:32.79412626260718
            • 0:73.38182225783144
            • 1:32.79412176803111
            • 0:73.38216204890047
            • 1:15.656504148386379
            • 0:75.48185411728672
            • 1:15.650523501363097
            • 0:79.25709428358672
            • 1:15.646932740263926
            • 0:83.0323343845583
            • 1:15.65052350640998
            • 0:85.54916121803853
            • 1:15.65531155198055
            • 0:88.0659879609319
            • 1:15.650523486946916
            • 0:91.8412281293297
            • 1:15.646932731827567
            • 0:95.6164683333735
            • 1:15.650523541996755
            • 0:98.13329506611277
            • 1:15.655311500554676
            • 0:100.2329870589596
            • 1:15.653303161209505
            • 0:100.23332694632873
            • 1:32.79412181427747
            • 0:93.93858380512515
            • 1:32.79412626662085
            • 0:91.00228587710755
            • 1:32.79412626840574
      • system:index:DFO_2507_From_20040620_to_20041007
      • system:time_end:1097107200000
      • system:time_start:1087689600000
      • threshold_b1b2:0.7
      • threshold_b7:675
      • threshold_type:standard
Number of points: 9120
In [ ]:
# geemap.ee_to_csv(
#     target_image_features, filename=f"../data/asg3/{target_flood}_data.csv"
# )

Plot 3D DEM of target image with flood area colored

In [ ]:
target_image_data = pd.read_csv(f"../data/asg3/{target_flood}_data.csv").to_numpy()

flooded = target_image_data[:, 0]
x = target_image_data[:, 1]
y = target_image_data[:, 2]
z = target_image_data[:, -1]

xmin, xmax = np.min(x), np.max(x)
ymin, ymax = np.min(y), np.max(y)
xstep, ystep = (
    np.round((xmax - xmin) / np.unique(x).shape).item(),
    np.round((ymax - ymin) / np.unique(y).shape).item(),
)
u = np.arange(start=xmin, stop=xmax, step=xstep)
v = np.arange(start=ymin, stop=ymax, step=ystep)

X, Y = np.meshgrid(u, v)
Z = scipy.interpolate.griddata(
    (x, y), z, (X, Y), method="nearest", fill_value=0, rescale=True
)
F = scipy.interpolate.griddata(
    (x, y), flooded, (X, Y), method="nearest", fill_value=0, rescale=True
)
Fmin, Fmax = np.min(F), np.max(F)
fig = go.Figure(
    data=[
        go.Surface(
            z=Z, colorscale="Plotly3", showscale=True, colorbar_title="Elevation"
        ),
        go.Surface(
            z=Z + 150,
            surfacecolor=F,
            cmin=Fmin,
            cmax=Fmax,
            colorscale=[
                [0, "rgba(7, 148, 242, 0.0)"],
                [0.3, "rgba(7, 148, 242, 1.0)"],
                [1, "rgba(7, 148, 242, 1.0)"],
            ],
            colorbar=dict(title="Flood mean", x=-0.1),
        ),
    ]
)
fig.update_traces(
    contours_z=dict(
        show=True,
        usecolormap=True,
        project_z=True,
        color="rgba(255, 0, 0, 0.5)",
    )
)
fig.update_layout(
    scene_camera_eye=dict(x=1.5, y=1.5, z=1.25),
    margin=dict(l=0, r=0, t=0, b=30),
    title=dict(text=f"3D DEM of {target_flood}", x=0.5, y=0.95),
    updatemenus=[
        dict(
            type="buttons",
            buttons=[
                dict(
                    label="Toggle Elevation Surface",
                    method="update",
                    args=[
                        {
                            "visible": [True, True],
                            "contours.z.usecolormap": [True, True],
                        },
                    ],
                    args2=[
                        {
                            "visible": [False, True],
                            "contours.z.usecolormap": [False, False],
                        },
                    ],
                ),
            ],
            direction="left",
            showactive=True,
            x=-0.1,
            xanchor="left",
            y=1.1,
            yanchor="top",
        )
    ],
)
fig.show(renderer="notebook")

Visualize dataset and target flood event with satellite view

In [ ]:
vis_params = {"min": 0, "max": 10, "palette": ["c3effe", "1341e8", "051cb0", "001133"]}

Map.addLayer(
    flood_collection.select("jrc_perm_water").sum().gte(1).selfMask(),
    {"min": 0, "max": 1, "palette": "c3effe"},
    "JRC Permanent Water",
)

Map.addLayer(
    flood_collection.select("flooded").sum().selfMask(),
    vis_params,
    "GFD Satellite Observed Flood Plain",
)

Map.add_colorbar_branca(
    colors=vis_params["palette"],
    vmin=vis_params["min"],
    vmax=vis_params["max"],
    layer_name="",
    caption="Flood occurrence",
    discrete=True,
)

Map.addLayer(
    images.select("flooded").sum().selfMask(),
    vis_params,
    "Masked GFD Satellite Observed Flood Plain",
)

Map.addLayer(
    target_image.select("flooded").selfMask(),
    vis_params,
    "Selected flood event",
)
Map.addLayer(
    ee.FeatureCollection(ee.Feature(target_image.geometry().bounds())).style(
        color="red", fillColor="00000000"
    ),
    {},
    "Selected flood event boundary",
)

Map
Out[ ]:
Map(center=[0, 0], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=SearchDataGUI(childr…

Feature Engineering¶

Initial selected features

In [ ]:
features = {
    "SR_B1": "Blue",
    "SR_B2": "Green",
    "SR_B3": "Red",
    "SR_B4": "NIR",
    "SR_B5": "SWIR_1",
    "SR_B7": "SWIR_2",
    "slope": "Slope",
    "z": "Elevation",
}
x = df_flood_data[features.keys()].rename(columns=features)
y = df_flood_data["flooded"]

Feature Creation¶

Transformed Index

  • NDWI - Normalized Difference Water Index
  • MBI - Modified Bare Soil Index
  • NDVI - Normalized Difference Vegetation Index
  • WRI - Water Ratio Index
  • AWEI - Automated Water Extraction Index
  • SI - Shadow Index
  • NWI - New Water Index
In [ ]:
x["NDWI"] = (x["Green"] - x["NIR"]) / (x["Green"] + x["NIR"])
x["MBI"] = (
    (x["SWIR_1"] - x["SWIR_2"] - x["NIR"]) / (x["SWIR_1"] + x["SWIR_2"] + x["NIR"])
) + 0.5
x["NDVI"] = (x["NIR"] - x["Red"]) / (x["NIR"] + x["Red"])
x["WRI"] = (x["Green"] + x["Red"]) / (x["NIR"] + x["SWIR_2"])
x["AWEI"] = (
    x["Blue"] + 2.5 * x["Green"] - 1.5 * (x["NIR"] + x["SWIR_1"]) - 0.25 * x["SWIR_2"]
)
x["SI"] = (1 - x["Red"]) * (1 - x["Blue"]) * (1 - x["Green"])
x["NWI"] = (x["Blue"] - (x["NIR"] + x["SWIR_1"] + x["SWIR_2"])) / (
    x["Blue"] + (x["NIR"] + x["SWIR_1"] + x["SWIR_2"])
)

Correlation heatmap of initial selected features

In [ ]:
corr = x.corr().round(4)
fig = px.imshow(corr, text_auto=True, aspect="auto")
fig.update_layout(title=dict(text="Correlation of Features", x=0.5))
fig.show(renderer="notebook")

Use LightGBM as base model for SHAP

In [ ]:
forest = lgb.LGBMClassifier(
    objective="binary",
    n_estimators=1000,
    learning_rate=0.01,
    deterministic=True,
    device_type="cpu",
    verbose=0,
)
forest
Out[ ]:
LGBMClassifier(deterministic=True, device_type='cpu', learning_rate=0.01,
               n_estimators=1000, objective='binary', verbose=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LGBMClassifier(deterministic=True, device_type='cpu', learning_rate=0.01,
               n_estimators=1000, objective='binary', verbose=0)
In [ ]:
X_train, X_test, y_train, y_test = cuml_train_test_split(
    x, y, test_size=0.3, shuffle=True, stratify=y
)

forest.fit(X_train.to_numpy(), y_train.to_numpy())

full_clustering_path = Path("../data/asg3/f_clustering.npy")
shap_values_path = Path("../data/asg3/shap_values.npy")

if full_clustering_path.exists():
    clustering = np.load(full_clustering_path)
else:
    clustering = shap.utils.hclust(X_train.to_numpy(), y_train.to_numpy())
    np.save(full_clustering_path, clustering)

if shap_values_path.exists():
    shap_values = np.load(shap_values_path)
else:
    masker = shap.maskers.Partition(X_train.to_numpy(), clustering=clustering)
    explainer = PermutationExplainer(
        forest.predict_proba, masker, data=X_train, masker_type="partition"
    )
    shap_values = explainer.shap_values(X_test.to_numpy())
    np.save(shap_values_path, shap_values)

All Features

In [ ]:
expected_value = np.mean(forest.predict_proba(X_test.to_numpy()), axis=0)

explanation = shap.Explanation(
    shap_values,
    data=X_test.to_numpy(),
    base_values=np.full(shape=np.sum(shap_values, axis=1).shape, fill_value=expected_value),
    feature_names=list(x.columns.values),
)

fig, axes = plt.subplots(1,2)

axes[0].set_title("Feature Importance and Clustering \nBased on SHAP Values")
shap.plots.bar(explanation[...,1], clustering=clustering, max_display=len(x.columns), ax=axes[0], show=False)

shap.decision_plot(expected_value[1], shap_values[...,1][:10], feature_names=list(x.columns.values), show=False)
axes[1] = plt.gca()
axes[1].set_title("Complexity of Model for Making Prediction")
fig.set_dpi(1000)
fig.suptitle("Analysis for All Features", fontsize=16)
fig.tight_layout()
fig.set_size_inches((10, 8))
No description has been provided for this image

Selected Features:

  • AWEI
  • NIR
  • "Slope
  • "Elevation
In [ ]:
final_features = ["AWEI", "NIR", "Slope", "Elevation"]

X_train, X_temp, y_train, y_temp = train_test_split(
    x[final_features], y, test_size=0.3, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp
)

np.savez(
    "../data/asg3/complete_data1.npz",
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    X_val=X_val,
    y_val=y_val,
)
In [ ]:
data = np.load("../data/asg3/complete_data1.npz")
(
    X_train,
    X_test,
    y_train,
    y_test,
    X_val,
    y_val,
) = (
    data["X_train"],
    data["X_test"],
    data["y_train"],
    data["y_test"],
    data["X_val"],
    data["y_val"],
)

train_data = lgb.Dataset(X_train, y_train, params={"feature_pre_filter": False})
In [ ]:
forest.fit(X_train, y_train)

explainer = shap.GPUTreeExplainer(forest)
clustering = shap.utils.hclust(X_train, y_train)

explanation = explainer(X_test)
shap_values = explainer.shap_values(X_test)
5it [00:16,  5.63s/it]                       
In [ ]:
fig, axes = plt.subplots(1, 2, figsize=(10, 8))

axes[0].set_title("Feature Importance and Clustering \nBased on SHAP Values")
shap.plots.bar(
    explanation,
    clustering=clustering,
    max_display=len(x.columns),
    ax=axes[0],
    show=False,
)

shap.decision_plot(
    explainer.expected_value,
    shap_values[:10],
    feature_names=final_features,
    link="logit",
    show=False,
    auto_size_plot=False,
)
axes[1] = plt.gca()
axes[1].set_title("Complexity of Model for Making Prediction")
fig.set_dpi(1000)
fig.suptitle("Analysis for Selected Features", fontsize=16)
fig.tight_layout()
fig.set_size_inches((10, 8))
No description has been provided for this image

Model Development & Hyperparameter Tuning¶

In [ ]:
def evaluate_model(y_pred, y_test):
    precision, recall, f1, support = precision_recall_fscore_support(
        y_test, y_pred, average="binary"
    )
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    return [accuracy, precision, recall, f1, auc]

Logistic Regression model as linear model

In [ ]:
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

lr_y_pred = logistic_model.predict(X_test)
lr_results = evaluate_model(lr_y_pred, y_test)
lr_results

LightGBM Model as non-linear model

In [ ]:
lgb_model = lgb.cv(
    {
        "objective": "binary",
        "metric": "auc",
        "device_type": "cpu",
        "verbose": -1,
        "boosting_type": "gbdt",
        "deterministic": True,
    },
    train_data,
    nfold=5,
    stratified=True,
    shuffle=True,
    metrics="auc",
    return_cvbooster=True,
)
# Predict and evaluate
lgb_y_pred = (lgb_model["cvbooster"].predict(X_test)[0] > 0.5).astype("int")
lgb_results = evaluate_model(lgb_y_pred, y_test)
lgb_results

TabNet as deep learning model

In [ ]:
device = "cuda" if torch.cuda.is_available() else "cpu"
tabnet = TabNetClassifier(device_name="cuda", verbose=0)
tabnet.fit(X_train, y_train, eval_set=[((X_val,y_val))])

tabnet_y_pred = tabnet.predict(X_test)
tabnet_results = evaluate_model(tabnet_y_pred, y_test)

Initial result comparison for model selection

In [ ]:
metrics_df = pd.DataFrame(
    {
        "Metric": ["Accuracy", "Precision", "Recall", "F1 Score", "AUC"],
        "Logistic Regression": lr_results,
        "LightGBM": lgb_results,
        "TabNet": tabnet_results,
    }
)

metrics_df.to_csv("../data/asg3/initial_results.csv", index=False)

metrics_df.style.background_gradient(axis=1)
Out[ ]:
  Metric Logistic Regression LightGBM TabNet
0 Accuracy 0.787208 0.854405 0.848183
1 Precision 0.792210 0.874955 0.865416
2 Recall 0.764326 0.818568 0.815678
3 F1 Score 0.778018 0.845823 0.839811
4 AUC 0.786666 0.853558 0.847414

Since LightGBM is the best model in term of selected metrics and computation time, it is used as final model for hypermeter tuning and classification

In [ ]:
cv_study = optuna.create_study(
    study_name="LightGBM_tuner_cv",
    storage="sqlite:///../data/asg3/lightgbm_tuner_cv.db",
    direction="maximize",
    load_if_exists=True,
)


tuner_pickle_path = Path("../data/asg3/lightgbm_tuner_cv.pkl")

if tuner_pickle_path.exists():
    # Load the tuner from the pickle file
    with open(tuner_pickle_path, "rb") as f:
        tuner = pickle.load(f)
else:
    tuner = LightGBMTunerCV(
        {
            "objective": "binary",
            "metric": "auc",
            "device_type": "cpu",
            "verbose": -1,
            "boosting_type": "gbdt",
            "deterministic": True,
        },
        train_data,
        study=cv_study,
        nfold=5,
        stratified=True,
        shuffle=True,
        verbosity=-1,
        show_progress_bar=True,
        return_cvbooster=True,
        model_dir="../data/asg3/models",
    )
    tuner.run()
    with open(tuner_pickle_path, "wb") as f:
        pickle.dump(tuner, f)

tuned_lgb = tuner.get_best_booster()
tuned_lgb_y_pred = (tuned_lgb.predict(X_test)[0] > 0.5).astype("int")
tuned_lgb_results = evaluate_model(tuned_lgb_y_pred, y_test)
  0%|          | 0/7 [00:00<?, ?it/s]
  0%|          | 0/20 [00:00<?, ?it/s]
  0%|          | 0/10 [00:00<?, ?it/s]
  0%|          | 0/6 [00:00<?, ?it/s]
  0%|          | 0/20 [00:00<?, ?it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

Final result comparison

In [ ]:
final_results = pd.read_csv("../data/asg3/initial_results.csv")
final_results["Tuned_LightGBM"] = tuned_lgb_results

final_results.to_csv("../data/asg3/final_results.csv")
final_results.style.background_gradient(axis=1)
Out[ ]:
  Metric Logistic Regression LightGBM TabNet Tuned_LightGBM
0 Accuracy 0.787208 0.854405 0.848183 0.864858
1 Precision 0.792210 0.874955 0.865416 0.882374
2 Recall 0.764326 0.818568 0.815678 0.834212
3 F1 Score 0.778018 0.845823 0.839811 0.857617
4 AUC 0.786666 0.853558 0.847414 0.864133

Model Evaluation¶

Draw a shape on map (preferable rectangle) or use existing Area of Interest (ROI)

In [ ]:
Map
Out[ ]:
Map(bottom=1996.0, center=[29.53522956294847, 80.77148437500001], controls=(WidgetControl(options=['position',…

This cell will draw back the last ROI geometry and reuse the data if exists

In [ ]:
bands = ["SR_B1", "SR_B2", "SR_B4", "SR_B5", "SR_B7", "z", "slope", "flooded"]

roi_geo_path = Path("../data/asg3/roi_geo.npy")

if Map.draw_last_feature:
    geometry = Map.draw_last_feature.geometry()
    bounds = geemap.ee_to_bbox(geometry)
    np.save(roi_geo_path, bounds)
    reduced_target_image = target_image.reduceResolution(
        reducer=ee.Reducer.mean(),
    )

    np.save(
        "../data/asg3/roi",
        geemap.ee_to_numpy(
            reduced_target_image,
            region=geometry,
            scale=250,
            bands=bands,
        ),
    )
    Map.remove_drawn_features()
else:
    if roi_geo_path.exists():
        bounds = np.load(roi_geo_path).tolist()
        roi_geo = ee.Geometry.Rectangle(coords=bounds)
        Map.centerObject(roi_geo, zoom=10)
        roi_geo = ee.FeatureCollection(roi_geo).style(
            fillColor="#3181cc33", color="red", width=1
        )
        Map.addLayer(roi_geo, {}, "ROI Geometry")

Transform pixel values within ROI into dataset and perform classification with confidence level of 0.95

In [ ]:
data: np.ndarray = np.load("../data/asg3/roi.npy")
rows, cols, bands_len = data.shape

array_2d = data.reshape(-1, bands_len)
coordinates = [(row, col) for row in range(rows) for col in range(cols)]
data_with_coords = [
    {"x": x, "y": y, **{bands[i]: array_2d[idx, i] for i in range(bands_len)}}
    for idx, (x, y) in enumerate(coordinates)
]

features = {
    "SR_B1": "Blue",
    "SR_B2": "Green",
    "SR_B4": "NIR",
    "SR_B5": "SWIR_1",
    "SR_B7": "SWIR_2",
    "slope": "Slope",
    "z": "Elevation",
}

df_target_image = pd.DataFrame(data_with_coords).rename(columns=features)
x = df_target_image[features.values()]
y = df_target_image["flooded"]
x["AWEI"] = (
    x["Blue"] + 2.5 * x["Green"] - 1.5 * (x["NIR"] + x["SWIR_1"]) - 0.25 * x["SWIR_2"]
)

confidence = 0.95

y_pred: np.ndarray = (tuned_lgb.predict(x[final_features])[0] > confidence).astype("int")
df_target_image["pred_flooded"] = y_pred

Plot the results as side by side image

In [ ]:
def get_grid(df, label):
    grid_x = df["x"].max() + 1
    grid_y = df["y"].max() + 1
    # Initialize an empty grid
    grid = np.zeros((grid_x, grid_y, 3), dtype=np.float32)

    # Populate the grid with colors
    for _, row in df.iterrows():
        x, y, color = int(row["x"]), int(row["y"]), row[label]
        grid[x, y] = color
    return grid


grid_ori = get_grid(df_target_image, "flooded")
grid_pred = get_grid(df_target_image, "pred_flooded")

fig = px.imshow(
    np.array((grid_ori, grid_pred)),
    facet_col=0,
    aspect="auto",
    height=600,
)

item_map = {
    f"{i}": key for i, key in enumerate(["Original", "Prediction (Confidence=0.95)"])
}
fig.for_each_annotation(lambda a: a.update(text=item_map[a.text.split("=")[1]]))
fig.update_layout(
    yaxis=dict(scaleanchor="x", scaleratio=1.5),
    title="Comparison of Original and Predicted Flooded Areas",
    title_x=0.5,
)
fig.show(renderer="notebook")